package com.cmge.ad.spider.article.waduanzi;

import java.util.ArrayList;
import java.util.List;

import org.springframework.util.StringUtils;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import com.cmge.ad.model.Article;

/**
 * WebMagic page processor for the "latest" listing of waduanzi
 * (http://www.waduanzi.com/mobile).
 *
 * <p>Every processed page is scanned for post panels. Each post that has text
 * or a thumbnail image is converted into an {@link Article}, and the resulting
 * list is published under the {@code articleList} result field. The first page
 * processed by an instance additionally queues the remaining listing pages.
 *
 * <p>Created 2014-12-29.
 *
 * @author ljt
 */
public class WaLatePageProcessor implements PageProcessor {

    /** Regular expression for the path of a listing page. */
    public static final String URL_LIST = "/mobile/latest/page/\\w+";

    /** Common prefix of every listing-page URL; the page number is appended. */
    private static final String PAGE_URL_PREFIX = "http://www.waduanzi.com/mobile/latest/page/";

    /**
     * Page-number limit of the listing. Pages 2 up to (but excluding) this value
     * are queued.
     * NOTE(review): the bound is exclusive, so page 7276 itself is never
     * requested — confirm whether the last page should be included.
     */
    private static final int MAX_PAGE = 7276;

    private final Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    /**
     * True until the follow-up listing pages have been queued once.
     * Plain (unsynchronized) flag; {@link #main} runs the spider with one thread.
     */
    private boolean seedPending = true;

    /**
     * Extracts the posts of the given page and, on the first call, queues the
     * remaining listing pages.
     *
     * @param page the downloaded page handed over by the spider
     */
    @Override
    public void process(Page page) {
        // Collect every post panel on the current page.
        List<Selectable> posts = page.getHtml()
                .xpath("//div[@class='panel panel10 post-item post-box']").nodes();
        if (posts != null && !posts.isEmpty()) {
            List<Article> articleList = new ArrayList<Article>();
            for (Selectable post : posts) {
                Article article = toArticle(post);
                if (article != null) {
                    articleList.add(article);
                }
            }
            page.putField("articleList", articleList);
        }

        if (seedPending) {
            for (int pageNo = 2; pageNo < MAX_PAGE; pageNo++) {
                page.addTargetRequest(PAGE_URL_PREFIX + pageNo);
            }
            seedPending = false;
        }
    }

    /**
     * Builds an article from one post panel.
     *
     * @param post the selected post panel
     * @return the article, or {@code null} when the post has neither text nor a
     *         thumbnail image and therefore should not be saved
     */
    private static Article toArticle(Selectable post) {
        String rawContent = post.xpath("//div[@class='item-content']/text()").get();
        String zan = post.xpath("//div[@class='item-toolbar']//a[@class='upscore site-bg']/text()").get();
        String minUrl = post.xpath("//div[@class='post-image']//img[@class='bmiddle']/@src").get();
        String maxUrl = post.xpath("//div[@class='post-image']//a/@data-bmiddle-url").get();

        // The text node may be missing (e.g. image-only posts); trimming it
        // unguarded would throw and abort processing of the whole page.
        String content = rawContent == null ? null : rawContent.trim();

        // Nothing worth saving when both the text and the image are empty.
        if (StringUtils.isEmpty(content) && StringUtils.isEmpty(minUrl)) {
            return null;
        }

        Article article = new Article();
        article.setZan(parseCount(zan));
        article.setContent(content);
        article.setMaxImageUrl(maxUrl);
        article.setMinImageUrl(minUrl);
        return article;
    }

    /**
     * Parses a counter taken from the page markup.
     *
     * @param text the raw text, possibly {@code null} or padded with whitespace
     * @return the parsed value, or 0 when the text is missing or not a number
     */
    private static int parseCount(String text) {
        if (text == null) {
            return 0;
        }
        String digits = text.trim();
        if (digits.length() == 0) {
            return 0;
        }
        try {
            return Integer.parseInt(digits);
        } catch (NumberFormatException ignored) {
            // A non-numeric label is treated as "no votes" instead of failing the page.
            return 0;
        }
    }

    /**
     * Returns the crawl settings used by this processor.
     *
     * @return the site configuration (3 retries, 100 ms sleep time)
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Starts a single-threaded spider on the first listing page and writes the
     * results through a {@code JsonPipeline}.
     *
     * @param args unused
     * @throws Exception declared for compatibility with existing callers
     */
    public static void main(String[] args) throws Exception {
        // JsonPipeline is not imported in this file; presumably it lives in this package.
        Spider spider = Spider.create(new WaLatePageProcessor())
                .addUrl(PAGE_URL_PREFIX + "1")
                .addPipeline(new JsonPipeline())
                .thread(1);
        spider.start();
    }
}
